EXPLORATORY DATA ANALYSIS
Housing in Brazil 🇧🇷
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import csv
import os
print ("Imported Successuflly")
Imported Successuflly
In this project, there are two separate datasets of homes for sale in Brazil.
Import the two dataset needed for analysis
df1 = pd.read_csv("brasil_real_estate_1.csv", sep=',', encoding='latin-1')
df1.head()
| property_type | state | region | lat | lon | area_m2 | price_brl | |
|---|---|---|---|---|---|---|---|
| 0 | apartment | Pernambuco | Northeast | -8.134204 | -34.906326 | 72.0 | 414222.98 |
| 1 | apartment | Pernambuco | Northeast | -8.126664 | -34.903924 | 136.0 | 848408.53 |
| 2 | apartment | Pernambuco | Northeast | -8.125550 | -34.907601 | 75.0 | 299438.28 |
| 3 | apartment | Pernambuco | Northeast | -8.120249 | -34.895920 | 187.0 | 848408.53 |
| 4 | apartment | Pernambuco | Northeast | -8.142666 | -34.906906 | 80.0 | 464129.36 |
df2 = pd.read_csv("brasil_real_estate_2.csv", sep=',', encoding='latin-1')
df2.head()
| property_type | place_with_parent_names | region | lat-lon | area_m2 | price_usd | |
|---|---|---|---|---|---|---|
| 0 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.6443051,-35.7088142 | 110 | $187,230.85 |
| 1 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.6430934,-35.70484 | 65 | $81,133.37 |
| 2 | house | |Brasil|Alagoas|Maceió| | Northeast | -9.6227033,-35.7297953 | 211 | $154,465.45 |
| 3 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.622837,-35.719556 | 99 | $146,013.20 |
| 4 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.654955,-35.700227 | 55 | $101,416.71 |
df1.info()
df1.isnull().sum()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 12833 entries, 0 to 12832 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 property_type 12833 non-null object 1 state 12833 non-null object 2 region 12833 non-null object 3 lat 12833 non-null float64 4 lon 12833 non-null float64 5 area_m2 11293 non-null float64 6 price_brl 12833 non-null float64 dtypes: float64(4), object(3) memory usage: 701.9+ KB
property_type 0 state 0 region 0 lat 0 lon 0 area_m2 1540 price_brl 0 dtype: int64
The column [area_m2] has missing values of 1540
df2.info()
df2.isnull().sum()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 12834 entries, 0 to 12833 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 property_type 12834 non-null object 1 place_with_parent_names 12834 non-null object 2 region 12834 non-null object 3 lat-lon 11551 non-null object 4 area_m2 12834 non-null int64 5 price_usd 12834 non-null object dtypes: int64(1), object(5) memory usage: 601.7+ KB
property_type 0 place_with_parent_names 0 region 0 lat-lon 1283 area_m2 0 price_usd 0 dtype: int64
The column [lat-lon] has missing values of 1283
df1.dropna(inplace = True)
df1
| property_type | state | region | lat | lon | area_m2 | price_brl | |
|---|---|---|---|---|---|---|---|
| 0 | apartment | Pernambuco | Northeast | -8.134204 | -34.906326 | 72.0 | 414222.98 |
| 1 | apartment | Pernambuco | Northeast | -8.126664 | -34.903924 | 136.0 | 848408.53 |
| 2 | apartment | Pernambuco | Northeast | -8.125550 | -34.907601 | 75.0 | 299438.28 |
| 3 | apartment | Pernambuco | Northeast | -8.120249 | -34.895920 | 187.0 | 848408.53 |
| 4 | apartment | Pernambuco | Northeast | -8.142666 | -34.906906 | 80.0 | 464129.36 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 12827 | house | São Paulo | Southeast | -23.595098 | -46.796448 | 180.0 | 419213.60 |
| 12828 | house | São Paulo | Southeast | -23.587495 | -46.559401 | 250.0 | 429194.89 |
| 12829 | apartment | São Paulo | Southeast | -23.522029 | -46.189290 | 55.0 | 252398.80 |
| 12830 | apartment | São Paulo | Southeast | -23.526443 | -46.529182 | 57.0 | 319400.84 |
| 12832 | apartment | Tocantins | North | -10.249091 | -48.324286 | 70.0 | 289457.01 |
11293 rows × 7 columns
df2.head(2)
| property_type | place_with_parent_names | region | lat-lon | area_m2 | price_usd | |
|---|---|---|---|---|---|---|
| 0 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.6443051,-35.7088142 | 110 | $187,230.85 |
| 1 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.6430934,-35.70484 | 65 | $81,133.37 |
df2[["lat", "lon"]] = df2["lat-lon"].str.split(",", expand = True).astype(float)
df2
| property_type | place_with_parent_names | region | lat-lon | area_m2 | price_usd | lat | lon | |
|---|---|---|---|---|---|---|---|---|
| 0 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.6443051,-35.7088142 | 110 | $187,230.85 | -9.644305 | -35.708814 |
| 1 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.6430934,-35.70484 | 65 | $81,133.37 | -9.643093 | -35.704840 |
| 2 | house | |Brasil|Alagoas|Maceió| | Northeast | -9.6227033,-35.7297953 | 211 | $154,465.45 | -9.622703 | -35.729795 |
| 3 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.622837,-35.719556 | 99 | $146,013.20 | -9.622837 | -35.719556 |
| 4 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.654955,-35.700227 | 55 | $101,416.71 | -9.654955 | -35.700227 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 12829 | apartment | |Brasil|Pernambuco|Recife| | Northeast | -8.056418,-34.909309 | 91 | $174,748.79 | -8.056418 | -34.909309 |
| 12830 | apartment | |Brasil|Pernambuco|Recife| | Northeast | -8.1373477,-34.909181 | 115 | $115,459.02 | -8.137348 | -34.909181 |
| 12831 | apartment | |Brasil|Pernambuco|Recife|Boa Viagem| | Northeast | -8.1136717,-34.896252 | 76 | $137,302.62 | -8.113672 | -34.896252 |
| 12832 | apartment | |Brasil|Pernambuco|Recife|Boa Viagem| | Northeast | NaN | 130 | $234,038.56 | NaN | NaN |
| 12833 | apartment | |Brasil|Pernambuco|Recife|Boa Viagem| | Northeast | -8.0578381,-34.882897 | 99 | $168,507.77 | -8.057838 | -34.882897 |
12834 rows × 8 columns
df2["state"] = df2["place_with_parent_names"].str.split("|", expand=True)[2]
df2.head(2)
| property_type | place_with_parent_names | region | lat-lon | area_m2 | price_usd | lat | lon | state | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.6443051,-35.7088142 | 110 | $187,230.85 | -9.644305 | -35.708814 | Alagoas |
| 1 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.6430934,-35.70484 | 65 | $81,133.37 | -9.643093 | -35.704840 | Alagoas |
del df2["place_with_parent_names"]
del df2["lat-lon"]
df2
| property_type | region | area_m2 | price_usd | lat | lon | state | |
|---|---|---|---|---|---|---|---|
| 0 | apartment | Northeast | 110 | $187,230.85 | -9.644305 | -35.708814 | Alagoas |
| 1 | apartment | Northeast | 65 | $81,133.37 | -9.643093 | -35.704840 | Alagoas |
| 2 | house | Northeast | 211 | $154,465.45 | -9.622703 | -35.729795 | Alagoas |
| 3 | apartment | Northeast | 99 | $146,013.20 | -9.622837 | -35.719556 | Alagoas |
| 4 | apartment | Northeast | 55 | $101,416.71 | -9.654955 | -35.700227 | Alagoas |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 12829 | apartment | Northeast | 91 | $174,748.79 | -8.056418 | -34.909309 | Pernambuco |
| 12830 | apartment | Northeast | 115 | $115,459.02 | -8.137348 | -34.909181 | Pernambuco |
| 12831 | apartment | Northeast | 76 | $137,302.62 | -8.113672 | -34.896252 | Pernambuco |
| 12832 | apartment | Northeast | 130 | $234,038.56 | NaN | NaN | Pernambuco |
| 12833 | apartment | Northeast | 99 | $168,507.77 | -8.057838 | -34.882897 | Pernambuco |
12834 rows × 7 columns
df2["price_usd"] = df2["price_usd"].str.replace("$", "").str.replace(",", "").astype(float)
df2
C:\Users\DAVID\AppData\Local\Temp\ipykernel_14600\2100640017.py:1: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
df2["price_usd"] = df2["price_usd"].str.replace("$", "").str.replace(",", "").astype(float)
| property_type | region | area_m2 | price_usd | lat | lon | state | |
|---|---|---|---|---|---|---|---|
| 0 | apartment | Northeast | 110 | 187230.85 | -9.644305 | -35.708814 | Alagoas |
| 1 | apartment | Northeast | 65 | 81133.37 | -9.643093 | -35.704840 | Alagoas |
| 2 | house | Northeast | 211 | 154465.45 | -9.622703 | -35.729795 | Alagoas |
| 3 | apartment | Northeast | 99 | 146013.20 | -9.622837 | -35.719556 | Alagoas |
| 4 | apartment | Northeast | 55 | 101416.71 | -9.654955 | -35.700227 | Alagoas |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 12829 | apartment | Northeast | 91 | 174748.79 | -8.056418 | -34.909309 | Pernambuco |
| 12830 | apartment | Northeast | 115 | 115459.02 | -8.137348 | -34.909181 | Pernambuco |
| 12831 | apartment | Northeast | 76 | 137302.62 | -8.113672 | -34.896252 | Pernambuco |
| 12832 | apartment | Northeast | 130 | 234038.56 | NaN | NaN | Pernambuco |
| 12833 | apartment | Northeast | 99 | 168507.77 | -8.057838 | -34.882897 | Pernambuco |
12834 rows × 7 columns
df1["price_usd"] = (df1['price_brl'] / 3.19).round(2)
df1
| property_type | state | region | lat | lon | area_m2 | price_brl | price_usd | |
|---|---|---|---|---|---|---|---|---|
| 0 | apartment | Pernambuco | Northeast | -8.134204 | -34.906326 | 72.0 | 414222.98 | 129850.46 |
| 1 | apartment | Pernambuco | Northeast | -8.126664 | -34.903924 | 136.0 | 848408.53 | 265958.79 |
| 2 | apartment | Pernambuco | Northeast | -8.125550 | -34.907601 | 75.0 | 299438.28 | 93867.80 |
| 3 | apartment | Pernambuco | Northeast | -8.120249 | -34.895920 | 187.0 | 848408.53 | 265958.79 |
| 4 | apartment | Pernambuco | Northeast | -8.142666 | -34.906906 | 80.0 | 464129.36 | 145495.10 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 12827 | house | São Paulo | Southeast | -23.595098 | -46.796448 | 180.0 | 419213.60 | 131414.92 |
| 12828 | house | São Paulo | Southeast | -23.587495 | -46.559401 | 250.0 | 429194.89 | 134543.85 |
| 12829 | apartment | São Paulo | Southeast | -23.522029 | -46.189290 | 55.0 | 252398.80 | 79121.88 |
| 12830 | apartment | São Paulo | Southeast | -23.526443 | -46.529182 | 57.0 | 319400.84 | 100125.66 |
| 12832 | apartment | Tocantins | North | -10.249091 | -48.324286 | 70.0 | 289457.01 | 90738.87 |
11293 rows × 8 columns
df1.drop(columns = ["price_brl"], inplace = True)
df1.isnull().sum()
property_type 0 state 0 region 0 lat 0 lon 0 area_m2 0 price_usd 0 dtype: int64
df2.isnull().sum()
property_type 0 region 0 area_m2 0 price_usd 0 lat 1283 lon 1283 state 0 dtype: int64
df2.dropna(inplace= True)
df2
| property_type | region | area_m2 | price_usd | lat | lon | state | |
|---|---|---|---|---|---|---|---|
| 0 | apartment | Northeast | 110 | 187230.85 | -9.644305 | -35.708814 | Alagoas |
| 1 | apartment | Northeast | 65 | 81133.37 | -9.643093 | -35.704840 | Alagoas |
| 2 | house | Northeast | 211 | 154465.45 | -9.622703 | -35.729795 | Alagoas |
| 3 | apartment | Northeast | 99 | 146013.20 | -9.622837 | -35.719556 | Alagoas |
| 4 | apartment | Northeast | 55 | 101416.71 | -9.654955 | -35.700227 | Alagoas |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 12828 | apartment | Northeast | 74 | 134182.11 | -8.044497 | -34.909519 | Pernambuco |
| 12829 | apartment | Northeast | 91 | 174748.79 | -8.056418 | -34.909309 | Pernambuco |
| 12830 | apartment | Northeast | 115 | 115459.02 | -8.137348 | -34.909181 | Pernambuco |
| 12831 | apartment | Northeast | 76 | 137302.62 | -8.113672 | -34.896252 | Pernambuco |
| 12833 | apartment | Northeast | 99 | 168507.77 | -8.057838 | -34.882897 | Pernambuco |
11551 rows × 7 columns
df = pd.concat([df1, df2])
print("df shape:", df.shape)
df shape: (22844, 7)
df
Now, the data will be explored. Data visualization skills will be used to learn more about the regional differences in the Brazilian real estate market.
fig = px.scatter_mapbox(
df,
lat="lat",
lon="lon",
center={"lat": -14.2, "lon": -51.9}, # Map will be centered on Brazil
width=1000,
height=600,
hover_data=["price_usd"], # Display price when hovering mouse over house
)
fig.update_layout(mapbox_style="open-street-map")
fig.show()
stats_summary = df[["area_m2","price_usd"]].describe()
stats_summary
| area_m2 | price_usd | |
|---|---|---|
| count | 22844.000000 | 22844.000000 |
| mean | 115.020224 | 194987.315515 |
| std | 47.742932 | 103617.682979 |
| min | 53.000000 | 74892.340000 |
| 25% | 76.000000 | 113898.770000 |
| 50% | 103.000000 | 165697.555000 |
| 75% | 142.000000 | 246900.882500 |
| max | 252.000000 | 525659.720000 |
# Build histogram
plt.hist(df["price_usd"])
# Label axes
plt.xlabel("Price [USD]")
plt.ylabel("Frequency")
# Add title
plt.title("Distribution of Home Prices")
plt.grid()
# Don't change the code below 👇
plt.savefig("Hist1.png", dpi=150)
# Build box plot
plt.boxplot(df["area_m2"], vert = False, )
# Label x-axis
plt.xlabel("Area [sq meters]")
# Add title
plt.title("Distribution of Home Sizes")
# Don't change the code below 👇
plt.savefig("Box plot.png", dpi=150)
average_home_prices_by_regions = df.groupby(by= "region").price_usd.mean().sort_values(ascending = True)
average_home_prices_by_regions
region Central-West 178596.283663 North 181308.958188 Northeast 185422.985482 South 189012.345360 Southeast 208996.762761 Name: price_usd, dtype: float64
average_home_prices_by_regions.plot(kind="bar", xlabel = "Regions", ylabel = "Average Home Prices", title = "Bar Chart for Average Home Prices by Region")
plt.xticks(rotation = 20)
plt.yticks(rotation = 0)
plt.grid()
# image
plt.savefig("Bar Chart.png", dpi=150)
Subsetting/Creating DataFrame for houses in only southern region of Brazil
df_south = df[df["region"]=="South"]
df_south
| property_type | state | region | lat | lon | area_m2 | price_usd | |
|---|---|---|---|---|---|---|---|
| 743 | house | Rio Grande do Sul | South | -30.027105 | -51.130470 | 188.0 | 115770.29 |
| 745 | apartment | Rio Grande do Sul | South | -30.039816 | -51.223164 | 65.0 | 123430.14 |
| 746 | apartment | Rio Grande do Sul | South | -29.696850 | -53.858382 | 142.0 | 185145.22 |
| 748 | apartment | Rio Grande do Sul | South | -30.033820 | -51.198596 | 151.0 | 256572.00 |
| 750 | apartment | Rio Grande do Sul | South | -30.034061 | -51.135494 | 68.0 | 75957.01 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 12113 | apartment | Paraná | South | -25.404114 | -49.250252 | 105.0 | 179585.59 |
| 12114 | apartment | Paraná | South | -25.461021 | -49.281609 | 150.0 | 327970.10 |
| 12115 | apartment | Paraná | South | -25.446156 | -49.266403 | 62.0 | 148142.66 |
| 12116 | apartment | Paraná | South | -25.446156 | -49.266403 | 85.0 | 211816.76 |
| 12119 | apartment | Paraná | South | -25.461010 | -49.330349 | 62.0 | 93615.42 |
7821 rows × 7 columns
df_south["state"].value_counts().sort_values(ascending=False)
Rio Grande do Sul 2643 Santa Catarina 2634 Paraná 2544 Name: state, dtype: int64
# Subset data
df_south_rgs = df_south[df_south["state"] == "Rio Grande do Sul"]
# Build scatter plot
plt.scatter(x= df_south_rgs["area_m2"], y = df_south_rgs["price_usd"])
# Label axes
plt.title("Price vs. Area")
plt.xlabel("Area [sq meters]")
plt.ylabel("Price [USD]")
# Add title
plt.title("Rio Grande do Sul: Price vs. Area")
plt.grid()
# Don't change the code below 👇
plt.savefig("scatter.png", dpi=150)
a= df_south[df_south["state"]=='Paraná']
b= df_south[df_south["state"]=='Rio Grande do Sul']
c= df_south[df_south["state"]=='Santa Catarina']
south_states_corr = {'Paraná': a["area_m2"].corr(a["price_usd"]), 'Rio Grande do Sul': b["area_m2"].corr(b["price_usd"]), 'Santa Catarina': c["area_m2"].corr(c["price_usd"])}
south_states_corr
{'Paraná': 0.5436659935502658,
'Rio Grande do Sul': 0.5773267433871903,
'Santa Catarina': 0.5068121769989855}